This vignette introduces the EPI-Clone algorithm in detail. The convenient wrapper function epiclone is described in the main readme of the github repo.

Unsupervised uMAP (Figure 1b,c)

First we load the data and perform an unsupervised dimensionality reduction of all CpGs.

require(Seurat)
require(ggplot2)
require(ROCR)
require(fossil)
require(reshape2)
require(pheatmap)
require(GenomicRanges)
full_seurat <- readRDS(url('https://figshare.com/ndownloader/files/42479346'))
larry <- subset(full_seurat, Experiment == "LARRY main experiment")

We perform straightforward Seurat dimensionality reduction.

usecpg <- rownames(larry)
larry <- ScaleData(larry, assay = "DNAm", features = usecpg, verbose = F)
larry <- RunPCA(larry, assay = "DNAm", features = usecpg, reduction.name = "pca", reduction.key = "PC_", npcs = 100, verbose = F)
larry <- RunUMAP(larry, reduction = "pca", dims = 1:50, verbose = F)
CpGSelection <- read.csv('../infos/cpg_selection.csv', row.names = 1)
selected_not_protein <- row.names(subset(CpGSelection, subset=Type=='Static'))

Identification of big clones vs. small clones (Figure 2i-j)

We perform dimensionality reduction on these CpGs only. Then we look for points in low-density regions (i.e. points which are far away from their nearest neighbors). There are a few parameters here:

The first step is to find the nearest neighbors and compute the distance:

npcs.bigCloneSelection <- 100
k.bigCloneSelection <- 5
thr.bigCloneSelection <- 0

larry <- ScaleData(larry, assay = "DNAm", features = selected_not_protein, verbose = F)
larry <- RunPCA(larry, assay = "DNAm", features = selected_not_protein, reduction.name = "clonePCA", reduction.key = "CLONEPC_", npcs = 100, verbose = F)
larry <- RunUMAP(larry,reduction = "clonePCA", dims = 1:npcs.bigCloneSelection, reduction.name = "cloneUMAP", reduction.key = "CLONEUMAP_" , verbose = F)
larry <- FindNeighbors(larry, reduction = "clonePCA", dims = 1:npcs.bigCloneSelection, k.param = k.bigCloneSelection, return.neighbor=T, graph.name = "clone.neighbors" , verbose = F)
larry$avgNNdist <- apply(larry@neighbors$clone.neighbors@nn.dist,1,function(x) mean(x[x>0]))

This distance estimate seems to be impacted by technical covariates, which we regress out:

to.summarise <- larry@meta.data[, c("ProcessingBatch" ,"CellType","avgNNdist","nFeature_DNAm", "PerformanceNonHhaI")]
m <- lm(avgNNdist ~ nFeature_DNAm + ProcessingBatch + CellType, data = to.summarise)
larry$avgNNres <- residuals(m)

Then there is an optional step, which is to smoothen the resulting estimate locally across a larger group of neighbors. This massively improved performance (from AUPRC ~0.3 to ~0.63)

smoothen.bigCloneSelection <- 20
larry <- FindNeighbors(larry, reduction = "clonePCA", dims = 1:npcs.bigCloneSelection, k.param = smoothen.bigCloneSelection, return.neighbor=T, graph.name = "clone.neighbors" )
larry$avgNNres <- apply(larry@neighbors$clone.neighbors@nn.idx,1, function(x) mean(larry$avgNNres[x]))

larry$selected <- larry$avgNNres < thr.bigCloneSelection

We can plot this quantity against clone size. The point clound on the left are cells with no larry barcode (see above, some of them are from expanded clones and have dropouts). The red line is the threshold used for selection; thisd plot can help defining an appropriate threshold, if clone size information is available.

## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.

If no clone size information is available, we recommend checking the density estimate on a uMAP and choosing a threshold that selects the big clusters, but not the cells interspersed between cluster.

We can now recreate the plots from figure 1e,f, which visually demonstrate that cells from big clones are correctly selected. Again, we’re only showing the cells that have a LARRY barcode (dropout follows similar distribution). In the first plot, grey dots are from clones with up to 30 cells.

Computation of ROC curves

If LARRY information is available, we can compute ROC curves

trueClone <- "LARRY"
ncells.bigClone <- 30
true_clone <- larry@meta.data[,trueClone]
larry$true_big_clone <-  larry$csize > ncells.bigClone
forAUC <-data.frame( true_small_clones = larry$csize <= ncells.bigClone, 
                     predictor = larry$avgNNres)
    forAUC <- na.omit(forAUC)
    predictions <- prediction( forAUC$predictor, forAUC$true_small_clones)
    perf <- performance(predictions, measure = "tpr", x.measure = "fpr" )

      plot(perf)
      abline(0,1)

      auc <- performance(predictions, measure = "auc")

Here results in an AUC 0.796.

Clustering of big clones only (Figure 1h)

Mostly the clustering shown in the uMAP above is of good quality, but the cells from small clusters often falsely get assigned to some cluster. We therefore prefer to select only the big clones, and re-cluster. Again, this step has a couple of parameters

npcs.Clustering = 100
k.Clustering = 25
res.Clustering=3

for_clustering <- subset(larry, selected)
    for_clustering <- ScaleData(for_clustering, assay = "DNAm", features = selected_not_protein, verbose=F)
    for_clustering <- RunPCA(for_clustering, assay = "DNAm", features = selected_not_protein, reduction.name = "clonePCA", reduction.key = "CLONEPC_", npcs = 100, verbose=F)
    for_clustering <- RunUMAP(for_clustering,reduction = "clonePCA", dims = 1:npcs.Clustering, reduction.name = "cloneUMAP", reduction.key = "CLONEUMAP_" , verbose=F)
    
    for_clustering <- FindNeighbors(for_clustering,reduction = "clonePCA", dims = 1:npcs.Clustering, k.param =k.Clustering, graph.name = "clone_nn", verbose=F)
    for_clustering <- FindClusters(for_clustering, resolution=res.Clustering, graph.name = "clone_nn", verbose=F)

We can now plot the clustering result and the ground truth clonal labels side by side. In gray are now cells without a LARRY barcode (dropouts that are part of big clones).

The Adjusted Rand index is 0.876.

We can also plot the overlap as a heatmap:

## Using unsuper as value column: use value.var to override.

Careful with cluster 4 - a big cluster mostly containing cells from small clones and NA! Likely this is a cluster of cells that falsely passed the original selection of “expanded clones”! We excluded it from the analyses in figure 2f+g.

Native hematopoieses

npcs <- 100
thrbig <- 10
CpGSelection <- read.csv('../infos/cpg_selection.csv', row.names=1)
usecpg <- row.names(subset(CpGSelection, subset=Type=='Static'))
scTAMseq <- full_seurat
young <- epiclone(subset(scTAMseq, Experiment == "Native Hematopoieis"), plotFolder = '.', tuneParams = F, 
               npcs.Clustering = npcs, selected.CpGs = usecpg, trueClone = NULL, batch = "ProcessingBatch", protein.assay.name = NULL, 
               thr.bigCloneSelection = thrbig,npcs.bigCloneSelection = npcs, smoothen.bigCloneSelection = 20, celltype = "CellType",
               returnIntermediateSeurat = T)
## Identifying big clones...
## Warning in irlba(A = t(x = object), nv = npcs, ...): You're computing too large
## a percentage of total singular values, use a standard svd instead.
young <- FindNeighbors(young, k.param = 20, reduction = "clonePCA",dims=1:100, graph.name = "clone_nn")
## Computing nearest neighbor graph
## Computing SNN
## Only one graph name supplied, storing nearest-neighbor graph only
young <- FindClusters(young, resolution=1.1, graph.name = "clone_nn")
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 7001
## Number of edges: 66513
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.3800
## Number of communities: 25
## Elapsed time: 0 seconds
## 10 singletons identified. 15 final clusters.
nativebig <- c("12","13","14","11","9","10","8", "6", "5", "7")
young$epiclone <- ifelse(!Idents(young) %in% nativebig, "small", as.character(Idents(young)))
colors <- rep("grey",length(unique(young$epiclone))); names(colors) <- unique(young$epiclone)
colors[names(colors)!="small"] <- RColorBrewer::brewer.pal(length(unique(young$epiclone))-1,name = "Paired")

DimPlot(young, reduction="cloneUMAP", group.by = "epiclone") + scale_color_manual(values = colors, guide=F) + NoAxes() + NoLegend() + ggtitle(NULL)

csize <- table(young$epiclone)
summary <- data.frame(clone = factor(young$epiclone, levels = names(csize)[order(csize)]))
ggplot(aes(fill = clone,x=1),data=summary) + geom_bar() + coord_polar(theta = "y") + scale_fill_manual(values = colors) + theme_bw() + theme(panel.grid = element_blank(), axis.ticks = element_blank(), axis.text =  element_blank(), axis.title = element_blank(), legend.position = "none")

Unsupervised uMAP (Figure 1b,c)

First we load the data and perform an unsupervised dimensionality reduction of all CpGs.

full_seurat <- readRDS(url("https://figshare.com/ndownloader/files/45262066"))

We perform straightforward Seurat dimensionality reduction

usecpg <- rownames(full_seurat)
full_seurat$Mature <- ifelse(full_seurat$CellType%in%c('HSC/MPP1', 'MPP3/4', 'pre/pro-B', 'MEP', 'GMP'), 'Immature', 'Mature')
#full_seurat <- subset(full_seurat, subset=Mature=='Mature')
full_seurat <- ScaleData(full_seurat, assay = "DNAm", features = usecpg, verbose = F)
full_seurat <- RunPCA(full_seurat, assay = "DNAm", features = usecpg, reduction.name = "pca", reduction.key = "PC_", npcs = 100, verbose = F)
full_seurat <- RunUMAP(full_seurat, reduction = "pca", dims = 1:50, verbose = F)

We can highlight on this the Celltype annotation. See the other vignette on how it was obtained.

DimPlot(full_seurat, group.by = "CellType",reduction="umap") + ggtitle("") + NoAxes()

In the main figure of the manuscript we show only the cells that carry a LARRY barcode. The cells where no LARRY barcode was observed (26.9 % of cells - due to dropout, or cells falsely sorted as LARRY+ in the FACS) follow a similar distribution.

full_seurat$use <- ifelse(!is.na(full_seurat$LARRY), "LARRY barcode", "no LARRY barcode")
DimPlot(full_seurat, group.by = "LARRY",reduction="umap", order=cloneorder, split.by = "use") + ggtitle("") + NoAxes() + NoLegend() + scale_color_manual(values = cloneColors.here)

Here, different shades of grey correspond to different LARRY clones represented with up to 30 cells and different colors correspond to different LARRY clones represented with >30 cells.

This plot suggests that overall, the methylome is impacted both by differentiation state and the clonal identity.

Identification of static CpGs (Figure 1d)

To more cleanly identify clones, EPI-Clone first looks for CpGs that are not associated with differentiation. We use surface antigen expression as a proxy for differentiation, since it is completely independent of methylation; alternatively, if you have good cell state annotation obtained from methylation (see also other vignette on cell state), this can be used as well. The epiclone wrapper function can handle both cases.

  #compute minimum pvalue for asociation with (any) protein
  suppressWarnings({
    
  pvals <- apply(full_seurat@assays$DNAm@data,1, function(met) {
    apply(full_seurat@assays$AB@data, 1, function(prot) {
      use <- !is.na(prot)
      a <- prot[use][met[use]==1]
      b <- prot[use][met[use]==0]
      if (length(a) < 3 | length(b) < 3) return(1) else return(ks.test(a,b)$p.value)
    })
  })
  min_pval <- apply(pvals,2,min)
  
  #establish bonferroni criterion
  #thr.protein.ass <- 1/(nrow(full_seurat@assays$DNAm@data) * nrow(full_seurat@assays$AB@data))
  # we need a lower threshold here to have more CpGs
  thr.protein.ass <- 1e-8
  
  #determine average overall methylation level
  avg_meth_rate <- apply(full_seurat@assays$DNAm@data, 1, mean)
  })

We use the LARRY labels to compute, for each CpG, the statistical association with clone (more accurately, any clone bigger than 30 cells). This value is only used for plotting but not for selecting CpGs, so of course, EPI-Clone also workd without clonal labels

trueClone <- "LARRY"
upper.thr.methrate <- 0.9
lower.thr.methrate <- 0.25
ncells.bigClone <- 30
true_clone <- full_seurat@meta.data[,trueClone]
for_prediction <- full_seurat

for_prediction$use <- !is.na(true_clone)
for_prediction <- subset(for_prediction, use)
a <-table(for_prediction@meta.data[,trueClone])
use_for_prediction <- names(a)[a > ncells.bigClone]
for_prediction$use <- for_prediction@meta.data[,trueClone] %in% use_for_prediction
for_prediction <- subset(for_prediction,use)

cloneid <- factor(for_prediction@meta.data[,trueClone], levels = unique(for_prediction@meta.data[,trueClone]))
suppressWarnings({
  pvals_cloneass <- p.adjust(apply(for_prediction@assays[["DNAm"]]@data,1,function(met) {
  if(var(met)==0) return(1)
  chisq.test(table(met,cloneid))$p.value
}),method = "bonferroni")
})

CpGSelection <- data.frame(CpG = names(pvals_cloneass), avg_meth_rate, min_pval, pvals_cloneass)
selected_not_protein <- names(min_pval)[min_pval > thr.protein.ass & avg_meth_rate < upper.thr.methrate & avg_meth_rate > lower.thr.methrate]
panel <- read.table('../infos/panel_info_dropout_pwm.tsv',
                    sep='\t',
                    header=TRUE)

to_plot <- data.frame(PVal=min_pval,
                      AvgMeth=avg_meth_rate,
                      PValClone=pvals_cloneass)

ggplot(to_plot, aes(x = AvgMeth, y = log10(ifelse(PVal<1e-21, 1e-21, PVal)), color = -log10(PValClone+1e-50)))+
  geom_point(size=.5, stroke=.5)+
  geom_hline(yintercept = log10(thr.protein.ass)) + geom_vline(xintercept = c(lower.thr.methrate,upper.thr.methrate)) +
  plot_theme + xlab("Average methylation") + ylab(ifelse(is.null(thr.protein.ass), "p value cell state association", "Association with surface\nprotein [log10]")) +
  scale_color_gradientn(colours = c("black","blue","red"), name = "-log10 p-val\nClone association")+
  scale_y_continuous(breaks=c(0, -7.5, -15), limits = c(-22,5))

The n = 130 dots in the upper central rectangle of this plot (Figure 1D) are of interest as static CpGs and used further for clustering of clones

Identification of big clones vs. small clones (Figure 2i-j)

We perform dimensionality reduction on these CpGs only. Then we look for points in low-density regions (i.e. points which are far away from their nearest neighbors). There are a few parameters here:

The first step is to find the nearest neighbors and compute the distance:

npcs.bigCloneSelection <- 100
k.bigCloneSelection <- 5
thr.bigCloneSelection <- 0

full_seurat <- ScaleData(full_seurat, assay = "DNAm", features = selected_not_protein, verbose = F)
full_seurat <- RunPCA(full_seurat, assay = "DNAm", features = selected_not_protein, reduction.name = "clonePCA", reduction.key = "CLONEPC_", npcs = 100, verbose = F)
full_seurat <- RunUMAP(full_seurat,reduction = "clonePCA", dims = 1:npcs.bigCloneSelection, reduction.name = "cloneUMAP", reduction.key = "CLONEUMAP_" , verbose = F)
full_seurat <- FindNeighbors(full_seurat, reduction = "clonePCA", dims = 1:npcs.bigCloneSelection, k.param = k.bigCloneSelection, return.neighbor=T, graph.name = "clone.neighbors" , verbose = F)
full_seurat$avgNNdist <- apply(full_seurat@neighbors$clone.neighbors@nn.dist,1,function(x) mean(x[x>0]))

This distance estimate seems to be impacted by technical covariates, which we regress out:

performance <- read.csv("../infos/performance_mature.csv", row.names = 1)
full_seurat <- AddMetaData(full_seurat, performance)
to.summarise <- full_seurat@meta.data[, c("Sample" ,"CellType","avgNNdist","nFeature_DNAm", "PerformanceNonHhaI")]
m <- lm(avgNNdist ~ nFeature_DNAm + Sample + CellType + PerformanceNonHhaI, data = to.summarise)
full_seurat$avgNNres <- residuals(m)

Then there is an optional step, which is to smoothen the resulting estimate locally across a larger group of neighbors. This massively improved performance (from AUPRC ~0.3 to ~0.63)

smoothen.bigCloneSelection <- 20
full_seurat <- FindNeighbors(full_seurat, reduction = "clonePCA", dims = 1:npcs.bigCloneSelection, k.param = smoothen.bigCloneSelection, return.neighbor=T, graph.name = "clone.neighbors" )
full_seurat$avgNNres <- apply(full_seurat@neighbors$clone.neighbors@nn.idx,1, function(x) mean(full_seurat$avgNNres[x]))

full_seurat$selected <- full_seurat$avgNNres < thr.bigCloneSelection

We can plot this quantity against clone size. The point clound on the left are cells with no larry barcode (see above, some of them are from expanded clones and have dropouts). The red line is the threshold used for selection; thisd plot can help defining an appropriate threshold, if clone size information is available.

## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.

If no clone size information is available, we recommend checking the density estimate on a uMAP and choosing a threshold that selects the big clusters, but not the cells interspersed between cluster.

We can now recreate the plots from figure 1e,f, which visually demonstrate that cells from big clones are correctly selected. Again, we’re only showing the cells that have a LARRY barcode (dropout follows similar distribution). In the first plot, grey dots are from clones with up to 30 cells.

Computation of ROC curves

If LARRY information is available, we can compute ROC curves

ncells.bigClone <- 20
  full_seurat$true_big_clone <-  full_seurat$csize > ncells.bigClone
  forAUC <-data.frame( true_small_clones = full_seurat$csize <= ncells.bigClone, 
                       predictor = full_seurat$avgNNres)
      forAUC <- na.omit(forAUC)
      predictions <- prediction( forAUC$predictor, forAUC$true_small_clones)
      perf <- performance(predictions, measure = "tpr", x.measure = "fpr" )
        plot(perf)
        abline(0,1)

        auc <- performance(predictions, measure = "auc")

Here results in an AUC 0.821.

Clustering of big clones only (Figure 1h)

Mostly the clustering shown in the uMAP above is of good quality, but the cells from small clusters often falsely get assigned to some cluster. We therefore prefer to select only the big clones, and re-cluster. Again, this step has a couple of parameters

npcs.Clustering = 100
k.Clustering = 25
res.Clustering=3

full_seurat$Origin <- ifelse(full_seurat$CellType%in%c('macrophages', 'mature myeloid', 'DCs')&full_seurat$Sample=='Lung', 'mature lung', 
                             ifelse(full_seurat$CellType%in%c('macrophages', 'mature myeloid', 'DCs')&full_seurat$Sample=='MyeLym', 'mature BM/PB', ifelse(full_seurat$Sample=='Lung', 'LSK', 'LK')))
for_clustering <- subset(full_seurat, selected)
    for_clustering <- ScaleData(for_clustering, assay = "DNAm", features = selected_not_protein, verbose=F)
    for_clustering <- RunPCA(for_clustering, assay = "DNAm", features = selected_not_protein, reduction.name = "clonePCA", reduction.key = "CLONEPC_", npcs = 100, verbose=F)
    for_clustering <- RunUMAP(for_clustering,reduction = "clonePCA", dims = 1:npcs.Clustering, reduction.name = "cloneUMAP", reduction.key = "CLONEUMAP_" , verbose=F)
    
    for_clustering <- FindNeighbors(for_clustering,reduction = "clonePCA", dims = 1:npcs.Clustering, k.param =k.Clustering, graph.name = "clone_nn", verbose=F)
    for_clustering <- FindClusters(for_clustering, resolution=res.Clustering, graph.name = "clone_nn", verbose=F)

We can now plot the clustering result and the ground truth clonal labels side by side. In gray are now cells without a LARRY barcode (dropouts that are part of big clones).

p3

to_plot <- data.frame(CellType=for_clustering$CellType, Clone=Idents(for_clustering), Sample=for_clustering$Origin)
to_plot <- plyr::count(to_plot)
rel_freq <- c()
for(i in 1:nrow(to_plot)){
  rel_freq <- c(rel_freq, to_plot[i, 'freq']/sum(to_plot[to_plot$Clone==to_plot[i, 'Clone'], 'freq']))
}
to_plot$RelFreq <- rel_freq
plot_ct <- ggplot(to_plot, aes(x=Clone, y=RelFreq, fill=CellType))+geom_histogram(stat='identity')+scale_fill_manual(values=celltypeColors)+plot_theme+ylab('Cell Type Fraction')+facet_wrap(Sample~.)+theme(axis.text.x = element_blank())
## Warning in geom_histogram(stat = "identity"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`
plot_total <- ggplot(to_plot, aes(x=Clone, y=freq))+geom_histogram(stat='identity', fill='grey25', color='grey25')+plot_theme+ylab('CloneSize')+theme(axis.text.x = element_blank())
## Warning in geom_histogram(stat = "identity", fill = "grey25", color =
## "grey25"): Ignoring unknown parameters: `binwidth`, `bins`, and `pad`
grid.arrange(plot_ct, plot_total, nrow=2, heights=2:1)

plot_theme <- theme(panel.background = element_rect(color='black',fill='white'),
                          panel.grid=element_blank(),
                          text=element_text(color='black',size=8),
                          axis.text=element_text(color='black',size=8),
                          axis.ticks=element_line(color='black', size=.1),
                          strip.background = element_blank(),
                          legend.position = 'none',
                          strip.text = element_text(color='black',size=8))
cloneSizes <- table(for_clustering@meta.data[,'LARRY'])
for_clustering$CloneForRiver <- as.factor(ifelse(cloneSizes[for_clustering@meta.data[,'LARRY']] <= 10, "small clone", for_clustering@meta.data[,'LARRY']))
forriver <- data.frame(LARRY = factor(for_clustering$CloneForRiver), unsuper = Idents(for_clustering), CellType=for_clustering$CellType)
adj_rand <- sapply(unique(forriver$CellType), function(ct){
  dat <- data.frame(LARRY=forriver[forriver$CellType==ct, 'LARRY'], EPI=forriver[forriver$CellType==ct, 'unsuper'])
  adj.rand.index(as.integer(as.factor(na.omit(dat)$LARRY)), as.integer(as.factor(na.omit(dat)$EPI)))
})
to_plot <- data.frame(CellType=unique(forriver$CellType), AdjRand=adj_rand)
ggplot(to_plot, aes(x=CellType, y=AdjRand, fill=CellType))+geom_histogram(stat='identity')+plot_theme+scale_fill_manual(values=celltypeColors)+
  theme(axis.text.x = element_text(angle=45, hjust=1))
## Warning in geom_histogram(stat = "identity"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`

For performance evaluation, we thus remove the macrophages from the data.

for_clustering <- subset(for_clustering, CellType!='macrophages')

The Adjusted Rand index is 0.750.

adj.rand.index(as.integer(as.factor(Idents(for_clustering))), as.integer(as.factor(for_clustering@meta.data[,'LARRY'])))
## [1] 0.7503906

We can also plot the overlap as a heatmap:

## Using unsuper as value column: use value.var to override.

to_plot <- data.frame(CellType=for_clustering$CellType, Clone=Idents(for_clustering))
to_plot <- plyr::count(to_plot)
rel_freq <- c()
for(i in 1:nrow(to_plot)){
  rel_freq <- c(rel_freq, to_plot[i, 'freq']/sum(to_plot[to_plot$Clone==to_plot[i, 'Clone'], 'freq']))
}
to_plot$RelFreq <- rel_freq
plot_ct <- ggplot(to_plot, aes(x=Clone, y=RelFreq, fill=CellType))+geom_histogram(stat='identity')+scale_fill_manual(values=celltypeColors)+plot_theme+ylab('Cell Type Fraction')+
  theme(axis.text.x = element_blank())
## Warning in geom_histogram(stat = "identity"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`
plot_total <- ggplot(to_plot, aes(x=Clone, y=freq))+geom_histogram(stat='identity')+plot_theme+xlab('CloneSize')
## Warning in geom_histogram(stat = "identity"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`
grid.arrange(plot_ct, plot_total, nrow=2)